#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# vim: tabstop=4 shiftwidth=4 softtabstop=4

# Copyright (C) 2019-2023, GEM Foundation

# OpenQuake is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# OpenQuake is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with OpenQuake.  If not, see <http://www.gnu.org/licenses/>.

import numpy
import shutil
from openquake.baselib import sap, hdf5
from openquake.baselib.writers import write_csv


def strip_dupl(fname, ndigits: int = 5):
    """
    Remove rows with repeated coordinates from a CSV file, in place.

    The `lon` and `lat` columns are rounded to `ndigits` decimals and two
    rows are considered duplicates when their rounded (lon, lat) pairs are
    equal; only the first row of each pair is kept, in the original order.

    If at least one duplicate is found the original file is copied to
    `fname + '.bak'` and `fname` is overwritten with the surviving rows
    (whose coordinates are the rounded ones). Otherwise nothing is written.
    In both cases a short message is printed.

    :param fname: path to a CSV file with fields lon, lat
    :param ndigits: number of decimal digits to keep in lon and lat
    """
    # NOTE(review): the None key presumably sets the dtype of every column
    # not listed explicitly -- confirm against hdf5.read_csv
    recs = hdf5.read_csv(
        fname, {None: str, 'lon': float, 'lat': float}).array
    recs['lon'] = numpy.round(recs['lon'], ndigits)
    recs['lat'] = numpy.round(recs['lat'], ndigits)

    seen = set()  # rounded (lon, lat) pairs already encountered
    kept = []  # first record found for each pair, in file order
    for row in recs:
        lonlat = tuple(row[['lon', 'lat']])
        if lonlat in seen:
            continue
        seen.add(lonlat)
        kept.append(row)

    if len(kept) == len(recs):
        print('There are no duplicate sites')
        return

    # keep a copy of the untouched input before overwriting it
    backup = fname + '.bak'
    shutil.copy(fname, backup)
    write_csv(fname, numpy.array(kept, recs.dtype))
    print(f'Reduced {fname} from {len(recs)} to {len(kept)} sites and saved '
          f'the original in {fname}.bak')

# Per-parameter descriptions attached to the function as attributes named
# after its arguments.
# NOTE(review): presumably read by sap.run to build the command-line help;
# confirm against openquake.baselib.sap
strip_dupl.fname = 'CSV file with fields lon, lat'
strip_dupl.ndigits = 'number of digits to keep'

# When executed as a script, hand strip_dupl over to sap.run
if __name__ == '__main__':
    sap.run(strip_dupl)
